/******************************************************************************
Safer in School? The Impact of Compulsory Schooling on Maltreatment and Associated Harms

Description: This program cleans the raw data and constructs the main variables
necessary for the analysis. The code produces intermediate datasets based on the
various raw datasets provided by government and semi-government departments in South
Australia, and then merges these data to form the master dataset used for the
analysis presented in the paper ("final_analysis.dta"). 

Note that directory paths have been removed to preserve privacy. Please insert 
the relevant directories in the global macros under "Setup" before running the
.do file. The 'tmp' directory is only used for storing intermediate data files, and
the 'clean' directory should only contain the analytical dataset constructed at
the end of this do file ("final_analysis.dta"). This code was written and
implemented in Stata MP 19.5, with a computational time of approximately ~5 minutes.

Packages required include:
•	ftools
•	reghdfe
•	estout
•	rdrobust
•	pzms
•	rddisttestk (.ado file by Brigham Frandsen available from 
	https://sites.google.com/view/brighamfrandsen/software?authuser=0)
•	rddensity
•	rdhonest

*****************************************************************************/

*******************************************************************************
*	Setup
*******************************************************************************
clear all
set more off
macro drop all
capture log close

global raw
global tmp
global clean
global output 

* Install Stata packages (if needed)
// ssc install ftools, replace
// ssc install reghdfe, replace
// ssc install estout, replace
// ssc install rdrobust, replace
// ssc install pzms, replace
// ssc install rddensity, replace
// capture ado uninstall rdhonest
// net install rdhonest, from("https://raw.githubusercontent.com/tbarmstr/RDHonest-vStata/master/")

*******************************************************************************
*	Section 1: Clean raw education data
*******************************************************************************

* Import education file into Stata (originally .sav) and save as .dta file
import spss "${raw}educ.sav", case(lower) clear
save "${raw}educ.dta", replace

* Read education .dta file
use "${raw}educ.dta", clear

* Rename individual identifiers to pslk3 for consistency with other datasets
rename pslk pslk3

* Encode 'exitreason' (string variable) into 'num_exitreason' (numeric variable)
encode exitreason, gen(num_exitreason)
tab num_exitreason

/*
  Regrouping different exit reasons:
  - Grouped vocational education and private training institution (1 = 3)
  - Grouped leaving SA categories into one category (9/15 = 8)
  - Grouped employment categories into one category (17 = 19)
  - Replaced with NULL (i.e. no exit) if transferred between government schools (21 = 16)
*/
recode num_exitreason (1 = 3) (9/15 = 8) (17 = 19) (21 = 16)

* Recode again so remaining variables are grouped together in sequence
recode num_exitreason (16 = 1) (18 = 9) (19 = 10) (20 = 11) (22 = 12)

* Label encoded exit reason variable
label define num_exitreason 1 "[1] NULL" 2 "[2] Attending non-government school" ///
	3 "[3] Attending VET" 4 "[4] Attending university" 5 "[5] Deceased" ///
	6 "[6] Exemption" 7 "[7] Illness" 8 "[8] Left SA" 9 "[9] Parenting/Carer" ///
	10 "[10] Employment" 11 "[11] Suspension/Exclusion" 12 "[12] Unknown" ///
	13 "[13] No exit record", modify
tab num_exitreason

* Replace exitreason with NULL (i.e. no exit) if transferred between government schools
replace exitreason = "NULL" if num_exitreason == 1

* Change leave date to missing (i.e. no exit) for school transfers previously recoded to NULL
replace leave_date = . if exitreason == "NULL"

* Reformat "leave_date" from a clock variable to a date variable
replace leave_date = dofc(leave_date)
format leave_date %d

* Identifying permanent leave date for all observations
bysort pslk3: egen date_left = max(leave_date)
format date_left %d

* Identifying specific row entries corresponding to the permanent leave date
gen leave_date_perm = date_left if leave_date == date_left
format leave_date_perm %d

*** Flag individuals with no recorded leave date (excluding school transfers)
bysort pslk3: egen last_censusyear = max(censusyear)
replace last_censusyear = . if censusyear != last_censusyear

* Flag non-problematic entries (i.e. permanent leave date exists)
gen prob_date = 1 if last_censusyear == censusyear & leave_date_perm != .
bysort pslk3: egen flag_date = sum(prob_date)
	
/*
   Recode entries where there is no permanent leave date. We asssume that in this
   case, individuals left the public school system at the end of the calendar year,
   in the last year they are observed in the school census.
*/
replace leave_date = mdy(12, 31, last_censusyear) ///
	if last_censusyear == censusyear & flag_date == 0 & leave_date_perm == .
replace leave_date_perm = mdy(12, 31, last_censusyear) ///
	if last_censusyear == censusyear & flag_date == 0 & leave_date_perm == .

* Recreate variable for exit date from the school system based on cleaned leave dates
drop date_left
bysort pslk3: egen date_left = max(leave_date_perm)
format date_left %d

* Merge with cohort file to get date of birth and then generate birthdays up to age 22
merge m:1 pslk3 using "${raw}ican_cohall.dta", keepusing(dob)
drop _merge

gen monthofbirth = month(dob)
gen yearofbirth = year(dob)

forval x = 15/22{
	gen date`x'y = year(dob) + `x'
	gen date`x'm = month(dob)
	gen date`x'd = day(dob)
	
	gen date`x' = mdy(date`x'm, date`x'd, date`x'y)
	* Shift birthday to March 1st in non-leap years for children born on 29th February
	replace date`x' = mdy(3, 1, date`x'y) if date`x' == .
	format date`x' %d
	
	gen year`x' = yearofbirth + `x'
	
	drop date`x'y
	drop date`x'm
	drop date`x'd
}

/*
  Construct variables for school enrolment, based on whether children are still 
  enrolled in public school for at least ~6 months after turning 16 or 17 years old
*/
* Generate dates for the 6th month after children's 16th or 17th birthday
gen age16_cutoff = .

forval x = 1/6{
	replace age16_cutoff = mdy(`x', 1, year17) if monthofbirth == `x' + 6
}

forval x = 7/12{
	replace age16_cutoff = mdy(`x', 1, year16) if monthofbirth == `x' - 6
}

gen age17_cutoff = .

forval x = 1/6{
	replace age17_cutoff = mdy(`x', 1, year18) if monthofbirth == `x' + 6
}

forval x = 7/12{
	replace age17_cutoff = mdy(`x', 1, year17) if monthofbirth == `x' - 6
}

/*
  Generate variable indicating school enrolment at age 16, noting that:
  1. Children who 'left' to transfer schools during at age 15+ are counted as 'enrolled'
  2. Children who left school because they completed high school (i.e. Year 12)
     are counted as 'enrolled'
*/
gen enrol16 = (date_left >= age16_cutoff)

* Flag children who 'left' because they transferred schools or completed high school
gen leave16_sch = 1 if leave_date_perm >= date15 & leave_date_perm < age16_cutoff ///
	& num_exitreason == 2
replace leave16_sch = 1 if leave_date_perm < age16_cutoff & ///
	month(leave_date_perm) == 12 & school_year_level == "12"
replace leave16_sch = 0 if leave_date_perm == .
bysort pslk: egen ind_leave16_sch = sum(leave16_sch)

replace enrol16 = 1 if ind_leave16_sch > 0

/*
  Generate variable indicating school enrolment at age 17, noting that:
  1. Children who 'left' to transfer schools at age 15+ are counted as 'enrolled'
  2. Children who left school because they completed high school (i.e. Year 12)
     are counted as 'enrolled'
*/
gen enrol17 = (date_left >= age17_cutoff)

* Flag children who 'left' because they transferred schools or completed high school
gen leave17_sch = 1 if leave_date_perm >= date15 & leave_date_perm < age17_cutoff ///
	& num_exitreason == 2
replace leave17_sch = 1 if leave_date_perm < age17_cutoff & ///
	month(leave_date_perm) == 12 & school_year_level == "12"
replace leave17_sch = 0 if leave_date_perm == .
bysort pslk: egen ind_leave17_sch = sum(leave17_sch)

replace enrol17 = 1 if ind_leave17_sch > 0 

/*
  Generate variables indicating the reasons for which children left school at
  age 16, including:
  1. Leaving school to attend vocational education and training (VET)
  2. Leaving school for employment
  3. Leaving school for all other reasons (i.e. neither VET nor employment)
*/

* Leaving school for VET
gen vet16 = (num_exitreason == 3 & enrol16 == 0 & leave_date_perm != .) 
bysort pslk: egen leave16_vet = max(vet16)

* Leaving school for employment
gen employment16 = (num_exitreason == 10 & enrol16 == 0 & leave_date_perm != .) 
bysort pslk3: egen leave16_employment = max(employment16)

* Leaving school for neither VET nor employment
gen other16 = ((num_exitreason != 3 & num_exitreason != 10) & enrol16 == 0 & leave_date_perm != .) 
bysort pslk: egen leave16_other = max(other16)

* Generate indicator for children who have a census record in the year of 16th birthday:
gen census15 = 1 if censusyear == year16
replace census15 = 1 if censusyear == year16 & yearofbirth == 1989

bysort pslk: egen rec_census15 = max(census15)

/*
  We now restrict the sample to children with census records in the year they
  turn 16 years old. This excludes children not in the public school system at 
  age 15 (e.g. children in private/catholic schools for whom we don't observe
  any schooling information, and children who migrated from SA).
*/
keep if rec_census15 == 1

* Collapse by individual ID
collapse rec_census15 enrol16 enrol17 leave16_vet leave16_employment leave16_other, by(pslk3)

* Save as intermediate data file
save "${tmp}educ_intermediate.dta", replace

*******************************************************************************
*	Section 2: Clean raw Child Protection Services (CPS) data
*******************************************************************************

* Import CPS datasets into Stata (originally .sav files) and save as .dta files
import spss "${raw}Merged_CP_notifications_all_variables_FINAL.sav", case(lower) clear
save "${raw}Merged_CP_notifications_all_variables_FINAL.dta", replace

import spss "${raw}Merged_CP_OOHC_PSLK3_Final.sav", case(lower) clear
save "${raw}Merged_CP_OOHC_PSLK3_Final.dta", replace

* Load CPS notifications data and append with out-of-home-care data
use "${raw}Merged_CP_notifications_all_variables_FINAL.dta", clear
append using "${raw}Merged_CP_OOHC_PSLK3_Final.dta", keep(pslk3 cleaned_placement_start_date) nolabel

* Merge with cohort file to get date of birth and then generate birthdays up to age 22
merge m:1 pslk3 using "${raw}ican_cohall.dta", keepusing(dob)
drop _merge

forval x = 15/22{
	gen date`x'y = year(dob) + `x'
	gen date`x'm = month(dob)
	gen date`x'd = day(dob)
	
	gen date`x' = mdy(date`x'm, date`x'd, date`x'y)
	* Shift birthday to March 1st in non-leap years for children born on 29th February
	replace date`x' = mdy(3, 1, date`x'y) if date`x' == .
	format date`x' %d
	
	drop date`x'y
	drop date`x'm
	drop date`x'd
}

* Reformat date-related variables to dates in Stata
local varlist date_notification cleaned_placement_start_date investigation_start_date investigation_end_date

foreach var in `varlist'{
	replace `var' = dofc(`var')
	format `var' %td
}

* Rename 'sex' (i.e. sexual abuse) to 'sexual', to avoid confusion with sex at birth
rename sex sexual

* Generate variable for CPS history up to age 16
bysort pslk3: egen first_notification = min(date_notification)
bysort pslk3: egen first_placement = min(cleaned_placement_start_date)

format first_notification first_placement %td

gen cps_age16_binary = (first_notification < date16 | first_placement < date16)

* Flag CPS notifications occurring after children's 16th birthday
gen notif = (date_notification >= date16 & date_notification != .)

* Construct variable for experiencing a CPS notification after 16th birthday
bysort pslk3: egen cps_after16_binary = max(notif)

/*
  Construct non-exclusive categories of alleged maltreatment after 16th birthday:
  emotional, physical, sexual, neglect
*/

local varlist emo neg phy sexual

foreach y in `varlist'{
	gen notif_`y' = (notif == 1 & `y' == 1)
	bysort pslk3: egen cps_after16_`y' = max(notif_`y')
}

/*
  Construct variables for new CPS notifications reported by: 
  1. School-based notifiers
  2. All other notifiers (i.e. non-school)
*/

gen school = 1 if reporter_source == "SCH"
gen nonschool = 1 if reporter_source != "SCH"

local varlist school nonschool

foreach y in `varlist'{
	gen notif_`y' = (notif == 1 & `y' == 1)
	bysort pslk3: egen cps_after16_`y' = max(notif_`y')
}

/*
  Construct variables for past CPS notifications (up to age 16) reported by:
  1. School-based notifier vs. non-school notifiers
  2. All other notifiers (i.e. non-school)
*/
local varlist school nonschool

foreach y in `varlist'{
	gen pastnotif_`y' = (date_notification < date16 & `y' == 1)
	bysort pslk3: egen cps_age16_`y' = max(pastnotif_`y')
	replace cps_age16_`y' = 0 if cps_age16_binary == 0
}

* Collapse to individual ID, with final variables
collapse cps_age16_* cps_after16_*, by(pslk3)

* Save as intermediate data file
save "${tmp}cps_intermediate.dta", replace

*******************************************************************************
*	Section 3: Clean raw emergency department (ED) data
*******************************************************************************

* Load emergency department (ED) data file
use "${raw}ican_ed_coh_cp.dta", clear

* Merge with cohort file to get date of birth and then generate birthdays up to age 22
merge m:1 pslk3 using "${raw}ican_cohall.dta", keepusing(dob)
drop _merge

forval x = 15/22{
	gen date`x'y = year(dob) + `x'
	gen date`x'm = month(dob)
	gen date`x'd = day(dob)
	
	gen date`x' = mdy(date`x'm, date`x'd, date`x'y)
	* Shift birthday to March 1st in non-leap years for children born on 29th February
	replace date`x' = mdy(3, 1, date`x'y) if date`x' == .
	format date`x' %d
	
	drop date`x'y
	drop date`x'm
	drop date`x'd
}

/*
  In most cases, we do not have the exact day of the ED presentation, so we assume 
  the ED presentation occurrred in the middle of the month (i.e. on the 15th) and
  adjust the admission dates accordingly. By default, the ED data records the day
  of admission as the 14th.
*/
replace e_my_p = mdy(e_p_m, 15, e_p_y)

**** Labelling and grouping ED visits based on their ICD-10 codes
gen e_diag2 = ""

* For ICD-10 codes starting with S
forval x = 0/9{
	replace e_diag2 = "S`x'0-S`x'9" if e_diag3 >= "S`x'0" & e_diag3 <= "S`x'9"
}

* For ICD-10 codes starting with T
replace e_diag2 = "T00-T07" if e_diag3 >= "T00" & e_diag3 <= "T07"
replace e_diag2 = "T08-T14" if e_diag3 >= "T08" & e_diag3 <= "T14"
replace e_diag2 = "T15-T19" if e_diag3 >= "T15" & e_diag3 <= "T19"
replace e_diag2 = "T20-T32" if e_diag3 >= "T20" & e_diag3 <= "T32"
replace e_diag2 = "T33-T35" if e_diag3 >= "T33" & e_diag3 <= "T35"
replace e_diag2 = "T36-T50" if e_diag3 >= "T36" & e_diag3 <= "T50"
replace e_diag2 = "T51-T65" if e_diag3 >= "T51" & e_diag3 <= "T65"
replace e_diag2 = "T66-T78" if e_diag3 >= "T66" & e_diag3 <= "T78"
replace e_diag2 = "T79-T79" if e_diag3 >= "T79" & e_diag3 <= "T79"
replace e_diag2 = "T80-T88" if e_diag3 >= "T80" & e_diag3 <= "T88"
replace e_diag2 = "T90-T98" if e_diag3 >= "T90" & e_diag3 <= "T98"

* For ICD-10 codes for injuries V01-X59
replace e_diag2 = "V01-X59" if e_diag3 >= "V01" & e_diag3 <= "X59"

*** A codes
replace e_diag2 = "A00-A09" if e_diag3 >= "A00" & e_diag3 <= "A09"
replace e_diag2 = "A15-A19" if e_diag3 >= "A15" & e_diag3 <= "A19"
replace e_diag2 = "A20-A28" if e_diag3 >= "A20" & e_diag3 <= "A28"
replace e_diag2 = "A30-A49" if e_diag3 >= "A30" & e_diag3 <= "A49"
replace e_diag2 = "A50-A64" if e_diag3 >= "A50" & e_diag3 <= "A64"
replace e_diag2 = "A65-A69" if e_diag3 >= "A65" & e_diag3 <= "A69"
replace e_diag2 = "A70-A74" if e_diag3 >= "A70" & e_diag3 <= "A74"
replace e_diag2 = "A75-A79" if e_diag3 >= "A75" & e_diag3 <= "A79"
replace e_diag2 = "A80-A89" if e_diag3 >= "A80" & e_diag3 <= "A89"
replace e_diag2 = "A92-A99" if e_diag3 >= "A92" & e_diag3 <= "A99"

*** B codes
replace e_diag2 = "B00-B09" if e_diag3 >= "B00" & e_diag3 <= "B09"
replace e_diag2 = "B15-B19" if e_diag3 >= "B15" & e_diag3 <= "B19"
replace e_diag2 = "B20-B24" if e_diag3 >= "B20" & e_diag3 <= "B24"
replace e_diag2 = "B25-B34" if e_diag3 >= "B25" & e_diag3 <= "B34"
replace e_diag2 = "B35-B49" if e_diag3 >= "B35" & e_diag3 <= "B49"
replace e_diag2 = "B50-B64" if e_diag3 >= "B50" & e_diag3 <= "B64"
replace e_diag2 = "B65-B83" if e_diag3 >= "B65" & e_diag3 <= "B83"
replace e_diag2 = "B85-B89" if e_diag3 >= "B85" & e_diag3 <= "B89"
replace e_diag2 = "B90-B94" if e_diag3 >= "B90" & e_diag3 <= "B94"
replace e_diag2 = "B95-B98" if e_diag3 >= "B95" & e_diag3 <= "B98"
replace e_diag2 = "B99-B99" if e_diag3 == "B99"

*** C codes
replace e_diag2 = "C00-C97" if e_diag3 >= "C00" & e_diag3 <= "C97"

*** D codes
replace e_diag2 = "D00-D09" if e_diag3 >= "D00" & e_diag3 <= "D09"
replace e_diag2 = "D10-D36" if e_diag3 >= "D10" & e_diag3 <= "D36"
replace e_diag2 = "D37-D48" if e_diag3 >= "D37" & e_diag3 <= "D48"
replace e_diag2 = "D50-D53" if e_diag3 >= "D50" & e_diag3 <= "D53"
replace e_diag2 = "D55-D59" if e_diag3 >= "D55" & e_diag3 <= "D59"
replace e_diag2 = "D60-D64" if e_diag3 >= "D60" & e_diag3 <= "D64"
replace e_diag2 = "D65-D69" if e_diag3 >= "D65" & e_diag3 <= "D69"
replace e_diag2 = "D70-D77" if e_diag3 >= "D70" & e_diag3 <= "D77"
replace e_diag2 = "D80-D89" if e_diag3 >= "D80" & e_diag3 <= "D89"

*** E codes
replace e_diag2 = "E00-E07" if e_diag3 >= "E00" & e_diag3 <= "E07"
replace e_diag2 = "E10-E14" if e_diag3 >= "E10" & e_diag3 <= "E14"
replace e_diag2 = "E15-E16" if e_diag3 >= "E15" & e_diag3 <= "E16"
replace e_diag2 = "E20-E35" if e_diag3 >= "E20" & e_diag3 <= "E35"
replace e_diag2 = "E40-E46" if e_diag3 >= "E40" & e_diag3 <= "E46"
replace e_diag2 = "E50-E64" if e_diag3 >= "E50" & e_diag3 <= "E64"
replace e_diag2 = "E65-E68" if e_diag3 >= "E65" & e_diag3 <= "E68"
replace e_diag2 = "E70-E90" if e_diag3 >= "E70" & e_diag3 <= "E90"

*** F codes
forval x = 0/8{
	replace e_diag2 = "F`x'0-F`x'9" if e_diag3 >= "F`x'0" & e_diag3 <= "F`x'9"
}
replace e_diag2 = "F40-F48" if e_diag3 >= "F40" & e_diag3 <= "F48"
replace e_diag2 = "F90-F98" if e_diag3 >= "F90" & e_diag3 <= "F98"
replace e_diag2 = "F99-F99" if e_diag3 == "F99"

*** G codes
replace e_diag2 = "G00-G09" if e_diag3 >= "G00" & e_diag3 <= "G09"
replace e_diag2 = "G10-G14" if e_diag3 >= "G10" & e_diag3 <= "G14"
replace e_diag2 = "G20-G26" if e_diag3 >= "G20" & e_diag3 <= "G26"
replace e_diag2 = "G30-G32" if e_diag3 >= "G30" & e_diag3 <= "G32"
replace e_diag2 = "G35-G37" if e_diag3 >= "G35" & e_diag3 <= "G37"
replace e_diag2 = "G40-G47" if e_diag3 >= "G40" & e_diag3 <= "G47"
replace e_diag2 = "G50-G59" if e_diag3 >= "G50" & e_diag3 <= "G59"
replace e_diag2 = "G60-G64" if e_diag3 >= "G60" & e_diag3 <= "G64"
replace e_diag2 = "G70-G73" if e_diag3 >= "G70" & e_diag3 <= "G73"
replace e_diag2 = "G80-G83" if e_diag3 >= "G80" & e_diag3 <= "G83"
replace e_diag2 = "G90-G99" if e_diag3 >= "G90" & e_diag3 <= "G99"

*** H codes
replace e_diag2 = "H00-H06" if e_diag3 >= "H00" & e_diag3 <= "H06"
replace e_diag2 = "H10-H13" if e_diag3 >= "H10" & e_diag3 <= "H13"
replace e_diag2 = "H15-H22" if e_diag3 >= "H15" & e_diag3 <= "H22"
replace e_diag2 = "H25-H28" if e_diag3 >= "H25" & e_diag3 <= "H28"
replace e_diag2 = "H30-H36" if e_diag3 >= "H30" & e_diag3 <= "H36"
replace e_diag2 = "H40-H42" if e_diag3 >= "H40" & e_diag3 <= "H42"
replace e_diag2 = "H43-H45" if e_diag3 >= "H43" & e_diag3 <= "H45"
replace e_diag2 = "H46-H48" if e_diag3 >= "H46" & e_diag3 <= "H48"
replace e_diag2 = "H49-H52" if e_diag3 >= "H49" & e_diag3 <= "H52"
replace e_diag2 = "H53-H54" if e_diag3 >= "H53" & e_diag3 <= "H54"
replace e_diag2 = "H55-H59" if e_diag3 >= "H55" & e_diag3 <= "H59"
replace e_diag2 = "H60-H62" if e_diag3 >= "H60" & e_diag3 <= "H62"
replace e_diag2 = "H65-H75" if e_diag3 >= "H65" & e_diag3 <= "H75"
replace e_diag2 = "H80-H83" if e_diag3 >= "H80" & e_diag3 <= "H83"
replace e_diag2 = "H90-H95" if e_diag3 >= "H90" & e_diag3 <= "H95"

*** I codes
replace e_diag2 = "I00-I02" if e_diag3 >= "I00" & e_diag3 <= "I02"
replace e_diag2 = "I05-I09" if e_diag3 >= "I05" & e_diag3 <= "I09"
replace e_diag2 = "I10-I15" if e_diag3 >= "I10" & e_diag3 <= "I15"
replace e_diag2 = "I20-I25" if e_diag3 >= "I20" & e_diag3 <= "I25"
replace e_diag2 = "I26-I28" if e_diag3 >= "I26" & e_diag3 <= "I28"
replace e_diag2 = "I30-I52" if e_diag3 >= "I30" & e_diag3 <= "I52"
forval x = 6/8{
	replace e_diag2 = "I`x'0-I`x'9" if e_diag3 >= "I`x'0" & e_diag3 <= "I`x'9"
}
replace e_diag2 = "I95-I99" if e_diag3 >= "I95" & e_diag3 <= "I99"

*** J codes
replace e_diag2 = "J00-J06" if e_diag3 >= "J00" & e_diag3 <= "J06"
replace e_diag2 = "J09-J18" if e_diag3 >= "J09" & e_diag3 <= "J18"
replace e_diag2 = "J20-J22" if e_diag3 >= "J20" & e_diag3 <= "J22"
replace e_diag2 = "J30-J39" if e_diag3 >= "J30" & e_diag3 <= "J39"
replace e_diag2 = "J40-J47" if e_diag3 >= "J40" & e_diag3 <= "J47"
replace e_diag2 = "J60-J70" if e_diag3 >= "J60" & e_diag3 <= "J70"
replace e_diag2 = "J80-J84" if e_diag3 >= "J80" & e_diag3 <= "J84"
replace e_diag2 = "J85-J86" if e_diag3 >= "J85" & e_diag3 <= "J86"
replace e_diag2 = "J90-J94" if e_diag3 >= "J90" & e_diag3 <= "J94"
replace e_diag2 = "J95-J99" if e_diag3 >= "J95" & e_diag3 <= "J99"

*** K codes
replace e_diag2 = "K00-K14" if e_diag3 >= "K00" & e_diag3 <= "K14"
replace e_diag2 = "K20-K31" if e_diag3 >= "K20" & e_diag3 <= "K31"
replace e_diag2 = "K35-K38" if e_diag3 >= "K35" & e_diag3 <= "K38"
replace e_diag2 = "K40-K46" if e_diag3 >= "K40" & e_diag3 <= "K46"
replace e_diag2 = "K50-K52" if e_diag3 >= "K50" & e_diag3 <= "K52"
replace e_diag2 = "K55-K64" if e_diag3 >= "K55" & e_diag3 <= "K64"
replace e_diag2 = "K65-K67" if e_diag3 >= "K65" & e_diag3 <= "K67"
replace e_diag2 = "K70-K77" if e_diag3 >= "K70" & e_diag3 <= "K77"
replace e_diag2 = "K80-K87" if e_diag3 >= "K80" & e_diag3 <= "K87"
replace e_diag2 = "K90-K93" if e_diag3 >= "K90" & e_diag3 <= "K93"

*** L codes
replace e_diag2 = "L00-L08" if e_diag3 >= "L00" & e_diag3 <= "L08"
replace e_diag2 = "L10-L14" if e_diag3 >= "L10" & e_diag3 <= "L14"
replace e_diag2 = "L20-L30" if e_diag3 >= "L20" & e_diag3 <= "L30"
replace e_diag2 = "L40-L45" if e_diag3 >= "L40" & e_diag3 <= "L45"
replace e_diag2 = "L50-L54" if e_diag3 >= "L50" & e_diag3 <= "L54"
replace e_diag2 = "L55-L59" if e_diag3 >= "L55" & e_diag3 <= "L59"
replace e_diag2 = "L60-L75" if e_diag3 >= "L60" & e_diag3 <= "L75"
replace e_diag2 = "L80-L99" if e_diag3 >= "L80" & e_diag3 <= "L99"

*** M codes
replace e_diag2 = "M00-M25" if e_diag3 >= "M00" & e_diag3 <= "M25"
replace e_diag2 = "M30-M36" if e_diag3 >= "M30" & e_diag3 <= "M36"
replace e_diag2 = "M40-M54" if e_diag3 >= "M40" & e_diag3 <= "M54"
replace e_diag2 = "M60-M79" if e_diag3 >= "M60" & e_diag3 <= "M79"
replace e_diag2 = "M80-M94" if e_diag3 >= "M80" & e_diag3 <= "M94"
replace e_diag2 = "M95-M99" if e_diag3 >= "M95" & e_diag3 <= "M99"

*** N codes
replace e_diag2 = "N00-N08" if e_diag3 >= "N00" & e_diag3 <= "N08"
replace e_diag2 = "N10-N16" if e_diag3 >= "N10" & e_diag3 <= "N16"
replace e_diag2 = "N17-N19" if e_diag3 >= "N17" & e_diag3 <= "N19"
replace e_diag2 = "N20-N23" if e_diag3 >= "N20" & e_diag3 <= "N23"
replace e_diag2 = "N25-N29" if e_diag3 >= "N25" & e_diag3 <= "N29"
replace e_diag2 = "N30-N39" if e_diag3 >= "N30" & e_diag3 <= "N39"
replace e_diag2 = "N40-N51" if e_diag3 >= "N40" & e_diag3 <= "N51"
replace e_diag2 = "N60-N64" if e_diag3 >= "N60" & e_diag3 <= "N64"
replace e_diag2 = "N70-N77" if e_diag3 >= "N70" & e_diag3 <= "N77"
replace e_diag2 = "N80-N98" if e_diag3 >= "N80" & e_diag3 <= "N98"
replace e_diag2 = "N99-N99" if e_diag3 == "N99"

*** O codes
replace e_diag2 = "O00-O08" if e_diag3 >= "O00" & e_diag3 <= "O08"
replace e_diag2 = "O10-O16" if e_diag3 >= "O10" & e_diag3 <= "O16"
replace e_diag2 = "O20-O29" if e_diag3 >= "O20" & e_diag3 <= "O29"
replace e_diag2 = "O30-O48" if e_diag3 >= "O30" & e_diag3 <= "O48"
replace e_diag2 = "O60-O75" if e_diag3 >= "O60" & e_diag3 <= "O75"
replace e_diag2 = "O80-O84" if e_diag3 >= "O80" & e_diag3 <= "O84"
replace e_diag2 = "O85-O92" if e_diag3 >= "O85" & e_diag3 <= "O92"
replace e_diag2 = "O94-O99" if e_diag3 >= "O94" & e_diag3 <= "O99"

*** P codes
replace e_diag2 = "P00-P04" if e_diag3 >= "P00" & e_diag3 <= "P04"
replace e_diag2 = "P05-P08" if e_diag3 >= "P05" & e_diag3 <= "P08"
replace e_diag2 = "P10-P15" if e_diag3 >= "P10" & e_diag3 <= "P15"
replace e_diag2 = "P20-P29" if e_diag3 >= "P20" & e_diag3 <= "P29"
replace e_diag2 = "P35-P39" if e_diag3 >= "P35" & e_diag3 <= "P39"
replace e_diag2 = "P50-P61" if e_diag3 >= "P50" & e_diag3 <= "P61"
replace e_diag2 = "P70-P74" if e_diag3 >= "P70" & e_diag3 <= "P74"
replace e_diag2 = "P75-P78" if e_diag3 >= "P75" & e_diag3 <= "P78"
replace e_diag2 = "P80-P83" if e_diag3 >= "P80" & e_diag3 <= "P83"
replace e_diag2 = "P90-P96" if e_diag3 >= "P90" & e_diag3 <= "P96"

*** Q codes
replace e_diag2 = "Q00-Q07" if e_diag3 >= "Q00" & e_diag3 <= "Q07"
replace e_diag2 = "Q10-Q18" if e_diag3 >= "Q10" & e_diag3 <= "Q18"
replace e_diag2 = "Q20-Q28" if e_diag3 >= "Q20" & e_diag3 <= "Q28"
replace e_diag2 = "Q30-Q34" if e_diag3 >= "Q30" & e_diag3 <= "Q34"
replace e_diag2 = "Q35-Q37" if e_diag3 >= "Q38" & e_diag3 <= "Q45"
replace e_diag2 = "Q50-Q56" if e_diag3 >= "Q50" & e_diag3 <= "Q56"
replace e_diag2 = "Q60-Q64" if e_diag3 >= "Q60" & e_diag3 <= "Q64"
replace e_diag2 = "Q65-Q79" if e_diag3 >= "Q65" & e_diag3 <= "Q79"
replace e_diag2 = "Q80-Q89" if e_diag3 >= "Q80" & e_diag3 <= "Q89"
replace e_diag2 = "Q90-Q99" if e_diag3 >= "Q90" & e_diag3 <= "Q99"

*** R codes
replace e_diag2 = "R00-R09" if e_diag3 >= "R00" & e_diag3 <= "R09"
replace e_diag2 = "R10-R19" if e_diag3 >= "R10" & e_diag3 <= "R19"
replace e_diag2 = "R20-R23" if e_diag3 >= "R20" & e_diag3 <= "R23"
replace e_diag2 = "R25-R29" if e_diag3 >= "R25" & e_diag3 <= "R29"
replace e_diag2 = "R30-R39" if e_diag3 >= "R30" & e_diag3 <= "R39"
replace e_diag2 = "R40-R46" if e_diag3 >= "R40" & e_diag3 <= "R46"
replace e_diag2 = "R47-R49" if e_diag3 >= "R47" & e_diag3 <= "R49"
replace e_diag2 = "R50-R69" if e_diag3 >= "R50" & e_diag3 <= "R69"
replace e_diag2 = "R70-R79" if e_diag3 >= "R70" & e_diag3 <= "R79"
replace e_diag2 = "R80-R82" if e_diag3 >= "R80" & e_diag3 <= "R82"
replace e_diag2 = "R83-R89" if e_diag3 >= "R83" & e_diag3 <= "R89"
replace e_diag2 = "R90-R94" if e_diag3 >= "R90" & e_diag3 <= "R94"
replace e_diag2 = "R95-R99" if e_diag3 >= "R95" & e_diag3 <= "R99"

*** U codes
replace e_diag2 = "U00-U49" if e_diag3 >= "U00" & e_diag3 <= "U49"
replace e_diag2 = "U82-U85" if e_diag3 >= "U82" & e_diag3 <= "U85"

*** X and Y codes
replace e_diag2 = "X60-X84" if e_diag3 >= "X60" & e_diag3 <= "X84"
replace e_diag2 = "X85-Y09" if e_diag3 >= "X85" & e_diag3 <= "Y09"
replace e_diag2 = "Y10-Y34" if e_diag3 >= "Y10" & e_diag3 <= "Y34"
replace e_diag2 = "Y35-Y36" if e_diag3 >= "Y35" & e_diag3 <= "Y36"
replace e_diag2 = "Y40-Y84" if e_diag3 >= "Y40" & e_diag3 <= "Y84"
replace e_diag2 = "Y85-Y89" if e_diag3 >= "Y85" & e_diag3 <= "Y89"
replace e_diag2 = "Y90-Y98" if e_diag3 >= "Y90" & e_diag3 <= "Y98"

*** Z codes
replace e_diag2 = "Z00-Z13" if e_diag3 >= "Z00" & e_diag3 <= "Z13"
replace e_diag2 = "Z20-Z29" if e_diag3 >= "Z20" & e_diag3 <= "Z29"
replace e_diag2 = "Z30-Z39" if e_diag3 >= "Z30" & e_diag3 <= "Z39"
replace e_diag2 = "Z40-Z54" if e_diag3 >= "Z40" & e_diag3 <= "Z54"
replace e_diag2 = "Z55-Z65" if e_diag3 >= "Z55" & e_diag3 <= "Z65"
replace e_diag2 = "Z70-Z76" if e_diag3 >= "Z70" & e_diag3 <= "Z76"
replace e_diag2 = "Z80-Z99" if e_diag3 >= "Z80" & e_diag3 <= "Z99"

*** ICD-10 broad groupings
gen e_diag1 = ""

* I - Certain infectious and parasitic diseases
replace e_diag1 = "I" if e_diag3 >= "A00" & e_diag3 <= "B99"

* II - Neoplasms
replace e_diag1 = "II" if e_diag3 >= "C00" & e_diag3 <= "D48"

* III - Diseases of the blood and blood-forming organs
replace e_diag1 = "III" if e_diag3 >= "D50" & e_diag3 <= "D89"

* IV - Endocrine, nutritional and metabolic diseases
replace e_diag1 = "IV" if e_diag3 >= "E00" & e_diag3 <= "E90"

* V - Mental and behavioural disorders
replace e_diag1 = "V" if e_diag3 >= "F00" & e_diag3 <= "F99"

* VI - Diseases of the nervous system
replace e_diag1 = "VI" if e_diag3 >= "G00" & e_diag3 <= "G99"

* VII - Diseases of the eye and adnexa
replace e_diag1 = "VII" if e_diag3 >= "H00" & e_diag3 <= "H59"

* VIII - Diseases of the ear and mastoid process
replace e_diag1 = "VIII" if e_diag3 >= "H60" & e_diag3 <= "H95"

* IX - Diseases of the circulatory system
replace e_diag1 = "IX" if e_diag3 >= "I00" & e_diag3 <= "I99"

* X - Diseases of the respiratory system
replace e_diag1 = "X" if e_diag3 >= "J00" & e_diag3 <= "J99"

* XI - Diseases of the digestive system
replace e_diag1 = "XI" if e_diag3 >= "K00" & e_diag3 <= "K93"

* XII - Diseases of the skin and subcutaneous tissue
replace e_diag1 = "XII" if e_diag3 >= "L00" & e_diag3 <= "L99"

* XIII - Diseases of the musculoskeletal system and connective tissue
replace e_diag1 = "XIII" if e_diag3 >= "M00" & e_diag3 <= "M99"

* XIV - Diseases of the genitourinary system
replace e_diag1 = "XIV" if e_diag3 >= "N00" & e_diag3 <= "N99"

* XV - Pregnancy, childbirth and the puerperium
replace e_diag1 = "XV" if e_diag3 >= "O00" & e_diag3 <= "O99"

* XVI - Certain conditions originating in the perinatal period
replace e_diag1 = "XVI" if e_diag3 >= "P00" & e_diag3 <= "P96"

* XVII - Congenital malformations, deformations and chromosomal abnormalities
replace e_diag1 = "XVII" if e_diag3 >= "Q00" & e_diag3 <= "Q99"

* XVIII - Symptoms, signs and abnormal clinical and laboratory findings
replace e_diag1 = "XVIII" if e_diag3 >= "R00" & e_diag3 <= "R99"

* XIX - Injury, poisoning and certain other consequences of external causes
replace e_diag1 = "XIX" if e_diag3 >= "S00" & e_diag3 <= "T98"

* XX - External causes of morbidity and mortality
replace e_diag1 = "XX" if e_diag3 >= "V01" & e_diag3 <= "Y98"

* XXI - Factors influencing health status and contact with health services
replace e_diag1 = "XXI" if e_diag3 >= "Z00" & e_diag3 <= "Z99"

* XXII - Codes for special purposes
replace e_diag1 = "XXII" if e_diag3 >= "U00" & e_diag3 <= "U85"

**** All-cause ED visits
* Construct variable for any ED visit at age 16
gen edvisit16 = (e_my_p >= date16 & e_my_p < date17)
bysort pslk3: egen evered16 = max(edvisit16)

* Construct variable for any ED visit between ages 17 and 21
gen edvisit_17to21 = (e_my_p >= date17 & e_my_p < date21)
bysort pslk3: egen evered_17to21 = max(edvisit_17to21)

/*
   Now we generate variables for cause-specific ED visits at age 16 based on
   ICD-10 codes and Major Diagnostic Blocks (MDB), grouped into mental illness,
   injuries, digestive issues, and all others (see also Gnanamanickam et al. (2022)
   for the grouping of diagnosis codes into the above categories in the ED dataset)
*/

**** ED visits related to mental illness
* Construct variable for any ED visit at age 16 for mental illness
gen edment16 = (e_my_p >= date16 & e_my_p < date17 ///
	& (diag_harm_pois == 1 | diag_alc == 1 | diag_ment == 1))
bysort pslk3: egen everedment16 = max(edment16)

/*
  Construct finer sub-categories of mental-health-related ED visits:
  1. Self-harm and psychiatric disorders (not drug/alcohol induced)
  2. Alcohol/drug abuse and poisonings
*/

* For self-harm and mental disorders (recoded so doesn't include drug/alcohol induced)
replace diag_ment = . if e_diag3 == "F18"
replace diag_alc = 1 if e_diag3 == "F18"

gen edment16_harm = ((e_my_p >= date16 & e_my_p < date17) ///
	& (diag_harm == 1 | diag_ment == 1))
bysort pslk3: egen everedment16_harm = max(edment16_harm)

* For drug/alcohol induced problems and poisonings
gen edment16_alc = ((e_my_p >= date16 & e_my_p < date17) ///
	& (diag_pois == 1 | diag_alc == 1))
bysort pslk3: egen everedment16_alc = max(edment16_alc)

**** ED visits related to injuries
* Construct variable for any ED visit at age 16 for injury
gen edinj16 = (e_my_p >= date16 & e_my_p < date17 & diag_inj == 1)
bysort pslk3: egen everedinj16 = max(edinj16)

**** ED visits for digestive system illness or issues 
* Construct indicator for digestive issues, unrelated to injury and mental illness
gen diag_digest = 1 if e_mdb_gr == "3C" & diag_inj != 1 & diag_harm_pois != 1 & diag_alc != 1 & diag_ment != 1 

* Construct variable for any ED visit at age 16 for digestive issues
gen eddigest16 = (e_my_p >= date16 & e_my_p < date17 & diag_digest == 1)
bysort pslk3: egen evereddigest16 = max(eddigest16)

**** ED visits for all other issues
* Construct indicator for other issues (not mental illness, injury, nor digestive)
gen diag_oth = 1 if diag_inj != 1 & diag_harm_pois != 1 & diag_alc != 1 & diag_ment != 1 & diag_digest != 1

* Construct variable for any ED visit at age 16 for other reasons
gen edoth16 = (e_my_p >= date16 & e_my_p < date17 & diag_oth == 1)
bysort pslk3: egen everedoth16 = max(edoth16)

* Save as intermediate data file (individual-by-visit level)
save "${tmp}ed_detailed_intermediate.dta", replace

* Collapse by individual ID and save intermediate file
collapse evered*, by(pslk3)
save "${tmp}ed_intermediate.dta", replace

*******************************************************************************
*	Section 4: Clean raw school absenteeism data (for analysis in Appendix C)
*******************************************************************************

* Import school absenteeism data into Stata (originally .sav) and save as .dta file
import spss "${raw}PSLK to Absence details (duplicates deleted and cleaned).sav", case(lower) clear
save "${raw}absences.dta", replace

* Load school absenteeism data
use "${raw}absences.dta", clear

* Rename individual identifiers to pslk3 for consistency with other datasets
rename _v1 pslk3

* Convert date of absence to workable Stata date variable
replace absent_date = dofc(absent_date)
format absent_date %td

* Keep only entries from the mid-year census (consistent across all students)
keep if census_type == "MID"

* Keep only necessary variables to improve computational speed
keep pslk3 absent_date census_year

* Merge with cohort file to get date of birth and then generate birthdays up to age 22
merge m:1 pslk3 using "${raw}ican_cohall.dta", keepusing(dob)
drop _merge

forval x = 15/22{
	gen date`x'y = year(dob) + `x'
	gen date`x'm = month(dob)
	gen date`x'd = day(dob)
	
	gen date`x' = mdy(date`x'm, date`x'd, date`x'y)
	* Shift birthday to March 1st in non-leap years for children born on 29th February
	replace date`x' = mdy(3, 1, date`x'y) if date`x' == .
	format date`x' %d
	
	drop date`x'y
	drop date`x'm
	drop date`x'd
}

/*
  Merge with intermediate schooling data to identify enrolment status at age 16 
  and then restrict sample to children observed in the public school census
  (see Section 1 of this .do file)
*/
merge m:1 pslk3 using "${tmp}educ_intermediate.dta", keepusing(rec_census15 enrol16)
drop _merge

keep if rec_census15 == 1

/*
  Construct indicator for absences recorded in the two terms for which children
  are observed between their 16th and 17th birthday.
*/
gen absent16 = (inrange(absent_date, date16, date17))
bysort pslk3: egen total_absent16 = sum(absent16)

/*
   Construct indicator for days present during the two school terms that children
   are observed for after their 16th birthday (on average 104 days of schooling
   across two terms). Those not enrolled in school at age 16 are assumed to have
   attended no days of school.
*/
gen days_present16 = 104 - total_absent16
replace days_present16 = 0 if enrol16 == 0

* Collapse to individual level
collapse days_present16, by(pslk3)

* Save as intermediate data file
save "${tmp}absences_intermediate.dta", replace

*******************************************************************************
*	Section 5: Merge intermediate datasets and construct main analytical file
*******************************************************************************

* Load iCAN cohort file
use "${raw}ican_cohall.dta", clear

* Generate birthdays between ages 15 and 22
gen monthofbirth = month(dob)
gen yearofbirth = year(dob)

forval x = 15/22{
	gen date`x'y = year(dob) + `x'
	gen date`x'm = month(dob)
	gen date`x'd = day(dob)
	
	gen date`x' = mdy(date`x'm, date`x'd, date`x'y)
	* Shift birthday to March 1st in non-leap years for children born on 29th February
	replace date`x' = mdy(3, 1, date`x'y) if date`x' == .
	format date`x' %d
	
	gen year`x' = yearofbirth + `x'
	
	drop date`x'y
	drop date`x'm
	drop date`x'd
}

*** Clean variables recorded at child's birth
* Sex - Recode to 0 for males and 1 for females (other values to missing)
recode sex (1 = 0) (2 = 1) (4 = .)

/*
  Low socioeconomic status - based on being in the bottom two deciles of the
  Index of Relative Socioeconomic Disadvantage (IRSD)
*/
replace adv_ses = . if missing(b_seifa_d)
replace adv_ses = 0 if b_seifa_d == 3 | b_seifa_d == 4

* Mother's marital status - Recode to 0 for married/de-facto and 1 for single
recode adv_m_marstat (1 = 0) (2 = 1) (3 = .)
replace adv_m_marstat = . if missing(mothstat) | mothstat == 9

* Mother's age (continuous) - Recode values 0 to 10 and > 200 to missing (outliers)
mvdecode mother_age, mv(0/10 218 220 332)

* Mother's age (binary) - Recode to 0 for age < 21 and 1 for maternal age >= 21
replace adv_m_age = . if missing(mother_age)
recode adv_m_age (2 = 0)

/*
  Congenital abnormalities - Flag any abnormality recorded in either the birth
  records or hospital records as a 1 (includes pediatric Complex Chronic Conditions)
*/
replace adv_cong = 1 if babyca1 > 0 | babyca2 > 0 | babyca3 > 0 | babyca4 > 0 | babyca5 > 0 | babyca6 > 0
replace adv_cong = . if missing(babyca1)
replace adv_cong = 1 if congeni_genetic_ccc == 1
replace adv_cong = 1 if adv_ccc == 1

* Gestation period (continous) - Recode value 99 to missing
mvdecode babygest, mv(99)

* Gestation period (binary) - 0 is >= 37 weeks and 1 is < 37 weeks
replace adv_b_gest = . if missing(babygest)

* Birthweight (continous) - Recode value 0 to missing
mvdecode babywght, mv(0)

* Birthweight (binary) - Recoded so that 0 is >= 2500g and 1 is < 2500g (low)
recode adv_wgt (2 = 0) (3 = .)
replace adv_wgt = . if missing(babywght)

**** Sample restrictions
* Restrict sample to children born between July 1987 and December 1996
keep if yearofbirth >= 1987 & yearofbirth <= 1996
drop if monthofbirth < 7 & yearofbirth == 1987

* Restrict sample to children born in South Australia
keep if ican_b_coh == 1

* Drop individuals with fetal or perinatal deaths (in birth records)
drop if babyoutc == 1 | babyoutc == 4

/*
  Drop individuals who died prior to the age of 21 (based on dates of death
  recorded in the National Death Index - NDI)
*/
replace month_of_death = mod_ndi if month_of_death == .
replace year_of_death = yod_ndi if year_of_death == .

gen date_death = mdy(month_of_death, 1, year_of_death)
format date_death %d

gen age_death = (date_death - dob)/365
replace age_death = floor(age_death)
drop if age_death < 21

* Keep only variables relevant for our analysis
keep pslk3 dob yearofbirth monthofbirth sex adv_ses adv_m_marstat mother_age adv_m_age ///
	adv_cong babygest adv_b_gest babywght adv_wgt

* Merge to intermediate data files constructed in Section 1 to Section 4
merge 1:1 pslk3 using "${tmp}educ_intermediate.dta"
drop if _merge == 2
drop _merge

merge 1:1 pslk3 using "${tmp}cps_intermediate.dta"
drop if _merge == 2
drop _merge

merge 1:1 pslk3 using "${tmp}ed_intermediate.dta"
drop if _merge == 2
drop _merge

merge 1:1 pslk3 using "${tmp}absences_intermediate.dta"
drop if _merge == 2
drop _merge

/*
  Now we construct the variables necessary for the RD analysis:
  1. Running variable - day-of-birth from the reform cut-off (i.e. 1 January 1993)
  2. Treatment indicator - born after the reform cut-off
  3. Interactions with past CPS involvement up to age 16
*/
* Generate running variable, day-of-birth from 1 January 1993
gen running_days = dob - mdy(1, 1, 1993)
gen h2 = running_days

* Generate weekly bins of the running variable (i.e. week-of-birth from 1 January 1993)
gen running_weeks = .

local a = -7
local b = 0

forval x=0/400{
	local a = `a' + 7
	local b = `b' + 7

	replace running_weeks = `x' if running_days >= `a' & running_days < `b'
}

local a = 0
local b = 7

forval x=-1(-1)-400{
	local a = `a' - 7
	local b = `b' - 7

	replace running_weeks = `x' if running_days >= `a' & running_days < `b'
}

gen h3 = running_weeks

* Generate monthly bins of the running variable (i.e. month-of-birth from January 1993)
gen running_months = .

local a = 0
local b = -12
local c = -24
local d = -36
local e = -48

forval x=0/48{
	local a = `a' + 1
	local b = `b' + 1
	local c = `c' + 1
	local d = `d' + 1
	
	if `x' < 12{
		replace running_months = `x' if monthofbirth == `a' & yearofbirth == 1993
	}
	if `x' >= 12 & `x' < 24{
		replace running_months = `x' if monthofbirth == `b' & yearofbirth == 1994
	}
	if `x' >= 24 & `x' < 36{
		replace running_months = `x' if monthofbirth == `c' & yearofbirth == 1995
	}
	if `x' >= 36 & `x' < 48{
		replace running_months = `x' if monthofbirth == `d' & yearofbirth == 1996
	}
	else{
		replace running_months = `x' if monthofbirth == `e' & yearofbirth == 1997
	}
}

local a = 0
local b = -12
local c = -24
local d = -36
local e = -48
local f = -60

forval x=-72/-1{
	local a = `a' + 1
	local b = `b' + 1
	local c = `c' + 1
	local d = `d' + 1
	local e = `e' + 1
	local f = `f' + 1
	
	if `x' >= -12{
		replace running_months = `x' if monthofbirth == `f' & yearofbirth == 1992
	}
	if `x' >= -24 & `x' <= -13{
		replace running_months = `x' if monthofbirth == `e' & yearofbirth == 1991
	}
	if `x' >= -36 & `x' <= -25{
		replace running_months = `x' if monthofbirth == `d' & yearofbirth == 1990
	}
	if `x' >= -48 & `x' <= -37{
		replace running_months = `x' if monthofbirth == `c' & yearofbirth == 1989
	}
	if `x' >= -60 & `x' <= -49{
		replace running_months = `x' if monthofbirth == `b' & yearofbirth == 1988
	}
	else{
		replace running_months = `x' if monthofbirth == `a' & yearofbirth == 1987
	}
}

gen h1 = running_months

* Generate treatment/reform indicator (i.e. born after 1 January 1993)
gen post1 = (h2 >= 0)

* Generate treatment/reform indicator interacted with past CPS involvement
gen nocps_post1 = (h2 >= 0 & cps_age16_binary == 0)
gen anycps_post1 = (h2 >= 0 & cps_age16_binary == 1)

* Generate individual month-of-birth indicators
tab monthofbirth, gen(mb)

* Label variables
label var sex "Female"
label var babywght "Birthweight (in grams)"
label var babygest "Gestation (in weeks)"
label var adv_wgt "Low birthweight (< 2500 grams)"
label var adv_m_age "Mother's age < 21 years old at birth"
label var adv_m_marstat "Mother not married/de facto at birth"
label var adv_b_gest "Low gestation (< 37 weeks)"
label var adv_ses "Low SES (bottom IRSD quintile)"
label var adv_cong "Any pediatric conditions"
label var mother_age "Mother's age (years) at birth"
label var dob "Date of birth"
label var monthofbirth "Month of birth"
label var yearofbirth "Year of birth"
label var rec_census15 "Schooling records available"
label var cps_age16_binary "Past CPS involvement up to age 16"
label var cps_age16_school "Past CPS involvement up to age 16 (school reporter)"
label var cps_age16_nonschool "Past CPS involvement up to age 16 (non-school reporter)"

label var enrol16 "School enrolment at 16"
label var enrol17 "School enrolment at 17"
label var leave16_vet "Left school at age 16 for VET"
label var leave16_employment "Left school at age 16 for employment"
label var leave16_other "Left school at age 16 for neither VET nor employment"
label var days_present16 "Days attended school at age 16"
label var cps_after16_binary "Any CPS notification after 16"
label var cps_after16_emo "Any CPS notification after 16 for emotional abuse"
label var cps_after16_phy "Any CPS notification after 16 for physical abuse"
label var cps_after16_sexual "Any CPS notification after 16 for sexual abuse"
label var cps_after16_neg "Any CPS notification after 16 for neglect"
label var cps_after16_school "Any CPS notification after 16 (school reporter)"
label var cps_after16_nonschool "Any CPS notification after 16 (non-school reporter)"
label var evered16 "Any ED visit at 16"
label var evered_17to21 "Any ED visit between 17 and 21"
label var everedinj16 "Any ED visit at 16 for injury"
label var everedment16 "Any ED visit at 16 for mental illness"
label var everedment16_harm "Any ED visit at 16 for mental illness (self-harm and poisonings)"
label var everedment16_alc "Any ED visit at 16 for mental illness (alcohol/drug abuse)"
label var evereddigest16 "Any ED visit at 16 for digestive system illness"
label var everedoth16 "Any ED visit at 16 for other reasons"

label var running_days "Running variable in days"
label var h2 "Running variable in days"
label var running_weeks "Running variable in weeks"
label var h3 "Running variable in weeks"
label var running_months "Running variable in months"
label var h1 "Running variable in months"
label var post1 "Reform"
label var nocps_post1 "Reform x No CPS"
label var anycps_post1 "Reform x Any CPS"

* Save as .dta file
save "${clean}analysis_final.dta", replace
